We want to look at real datasets to simulate realistic datasets.

filterData = F
if (filterData){
  cluster = read.csv('retina_clusteridentities.txt', sep = '\t',
                     stringsAsFactors = F, header = F)
  rownames(cluster) = cluster$V1
  dropseq = read.csv('GSE63472_P14Retina_merged_digital_expression.txt',
                     sep = '\t', stringsAsFactors = F)
  prefilter = dropseq[, colnames(dropseq) %in% cluster$V1]
  selectCluster = cluster[colnames(prefilter), 'V2'] %in% c(1, 2, 7, 32, 37)
  prefilter = prefilter[, selectCluster]
  filter <- apply(prefilter > 10, 1, sum) >= 10
  postfilter <- prefilter[filter,]
  write.csv(postfilter, 'GSE63472_P14Retina_merged_digital_expression_filt.csv')
}

core = read.csv('GSE63472_P14Retina_merged_digital_expression_filt.csv',
                stringsAsFactors = F)
core = as.matrix(core)
cluster = read.csv('retina_clusteridentities.txt', sep = '\t',
                   stringsAsFactors = F, header = F)
rownames(cluster) = cluster$V1
bio = factor(cluster[colnames(core), 'V2'])
detection_rate <- colSums(core>0)
coverage <- colSums(core)

We will color-code the cells by inferred cluster (see paper ‘Highly Parallel Genome-wide Expression Profiling of Individual Cells Using Nanoliter Droplets’). Select cells from clusters 1,2,7,32, and 37 (corresponding to different cell classes in the retina), filter out the genes that do not have at least 10 counts in at least 10 cells. Dimensions of the dataset is

dim(core)
## [1]  703 1583

Note that after filtering, the number of genes is small than the number of cells (like in Zeisel dataset).

par(mfrow = c(1,2))
pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
plot(mean, rowMeans(core == 0), xlim = c(0,4), ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              main = '1,000 most variable genes')
smoothScatter(mean, rowMeans(core == 0), xlim = c(0,4),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = '1,000 most variable genes')

Fit data with K = 4, V and X intercepts in both Mu and Pi, and commondispersion = FALSE.

print(system.time(zinb <- zinbFit(core, ncores = 3, K = 4,
                                  commondispersion = FALSE)))
##     user   system  elapsed 
## 4367.869 1791.992 2296.210

True W

If we simulate W from real data, it will look like that.

pairs(zinb@W, col = cols[bio], pch = 19, main = "ZINB")

Gamma

gamma_mu = zinb@gamma_mu[1,]
gamma_pi = zinb@gamma_pi[1,]

df <- data.frame(gamma_mu = gamma_mu,
                 gamma_pi = gamma_pi,
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                  gamma_mu   gamma_pi detection_rate   coverage
## gamma_mu        1.0000000 -0.3535422      0.9075246  0.9488025
## gamma_pi       -0.3535422  1.0000000     -0.5263647 -0.4920994
## detection_rate  0.9075246 -0.5263647      1.0000000  0.9740914
## coverage        0.9488025 -0.4920994      0.9740914  1.0000000
gamma = data.frame(gamma_mu = gamma_mu, gamma_pi = gamma_pi, celltype = bio)
gamma = melt(gamma)

ggplot(gamma, aes(x = value)) + 
  geom_histogram(aes(y = ..density..), bins = 50, col = 'gray') +
  geom_density(col= 'blue', size = .5) + 
  facet_grid(~ variable) + xlab('gamma')

ggplot(gamma, aes(x = variable, y = value)) + xlab('') + theme_bw() +
  geom_boxplot() + coord_flip() + facet_grid(~ variable, scales = 'free') +
  theme_bw() + ylab('gamma') + scale_x_discrete(breaks = c('', ''), drop=FALSE)

ggplot(gamma, aes(value, fill = celltype)) + geom_density(alpha = 0.2) +
  facet_grid(~ variable) + xlab('gamma') + theme_bw()

Beta

beta_mu = zinb@beta_mu[1,]
beta_pi = zinb@beta_pi[1,]

df <- data.frame(beta_mu = beta_mu,
                 beta_pi = beta_pi)
pairs(df, pch=19)

print(cor(df, method="spearman"))
##            beta_mu    beta_pi
## beta_mu  1.0000000 -0.2221312
## beta_pi -0.2221312  1.0000000
beta = data.frame(beta_mu = beta_mu, beta_pi = beta_pi)
beta = melt(beta)
ggplot(beta, aes(x = value)) + 
  geom_histogram(aes(y = ..density..), bins = 100, col = 'gray') +
  geom_density(col= 'blue', size = .5) + 
  facet_grid(~ variable, scales = 'free') + xlab('beta')

ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
  geom_boxplot() + facet_grid(~ variable, scales = 'free') + coord_flip() +
  theme_bw() + ylab('beta') + scale_x_discrete(breaks = c('', ''), drop=FALSE)

# remove outliers
max = max(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
min = min(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
  geom_boxplot(outlier.shape = NA) +
  facet_grid(~ variable, scales = 'free') + coord_flip() +
  theme_bw() + ylab('beta removing outliers') +
  scale_x_discrete(breaks = c('', ''), drop=FALSE) +
  scale_y_continuous(limits = c(min, max))
## Warning: Removed 838 rows containing non-finite values (stat_boxplot).

Alpha

pairs(t(zinb@alpha_mu), main = 'alpha_mu')

pairs(t(zinb@alpha_pi), main = 'alpha_pi')

df <- data.frame(alpha_mu_1 = zinb@alpha_mu[1, ],
                 alpha_mu_2 = zinb@alpha_mu[2, ],
                 alpha_mu_3 = zinb@alpha_mu[3, ],
                 alpha_mu_4 = zinb@alpha_mu[4, ],
                 alpha_pi_1 = zinb@alpha_pi[1, ],
                 alpha_pi_2 = zinb@alpha_pi[2, ],
                 alpha_pi_3 = zinb@alpha_pi[3, ],
                 alpha_pi_4 = zinb@alpha_pi[4, ])
pairs(df, pch=19)

print(cor(df, method="spearman"))
##             alpha_mu_1  alpha_mu_2   alpha_mu_3   alpha_mu_4  alpha_pi_1
## alpha_mu_1  1.00000000 -0.16924097 -0.439265936  0.096582896 -0.09114714
## alpha_mu_2 -0.16924097  1.00000000  0.283156270 -0.128433814 -0.19545680
## alpha_mu_3 -0.43926594  0.28315627  1.000000000 -0.001863545  0.10310579
## alpha_mu_4  0.09658290 -0.12843381 -0.001863545  1.000000000  0.25073828
## alpha_pi_1 -0.09114714 -0.19545680  0.103105786  0.250738282  1.00000000
## alpha_pi_2 -0.14822571  0.18279627  0.126350702 -0.261374516 -0.35861884
## alpha_pi_3  0.22894955 -0.01948612 -0.229933375  0.057926624  0.01539616
## alpha_pi_4 -0.15200914  0.21313809  0.115836116 -0.363158579 -0.37953663
##            alpha_pi_2  alpha_pi_3  alpha_pi_4
## alpha_mu_1 -0.1482257  0.22894955 -0.15200914
## alpha_mu_2  0.1827963 -0.01948612  0.21313809
## alpha_mu_3  0.1263507 -0.22993337  0.11583612
## alpha_mu_4 -0.2613745  0.05792662 -0.36315858
## alpha_pi_1 -0.3586188  0.01539616 -0.37953663
## alpha_pi_2  1.0000000 -0.14502794  0.44001938
## alpha_pi_3 -0.1450279  1.00000000 -0.04517799
## alpha_pi_4  0.4400194 -0.04517799  1.00000000

Dispersion

par(mfrow=c(1,1))
set = newSeqExpressionSet(core)
fq = betweenLaneNormalization(set, which = "full", offset = T)
disp = estimateDisp(counts(fq), offset = -offst(fq))
## Design matrix not provided. Switch to the classic mode.
plot(disp$tagwise.dispersion, 1/exp(zinb@zeta), ylab = 'zinb dispersion',
     xlab = 'edgeR tagwise dispersion', main = 'Dispersion')

print(cor(disp$tagwise.dispersion, 1/exp(zinb@zeta), method="spearman"))
## [1] 0.589623
par(mfrow = c(1, 2))
plot(density(1/exp(zinb@zeta)), main = 'zinb dispersion')
plot(density(disp$tagwise.dispersion), main = 'edgeR dispersion')

par(mfrow = c(1, 2))
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
plot(mean, 1/exp(zinb@zeta), main = 'zinb', xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 14))
plot(mean, disp$tagwise.dispersion, main = 'edgeR',xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 14))

Estimated mu and pi

par(mfrow=c(1, 2))
detection_rate <- colMeans(core>0)
coverage <- colSums(core)
plot(rowMeans(getPi(zinb)), detection_rate, xlab="Average estimated Pi", ylab="Detection Rate for each cell", pch=19, col=cols[bio], ylim = c(0, 1))
plot(rowMeans(log1p(getMu(zinb))), coverage, xlab="Average estimated log Mu", ylab="Coverage", pch=19, col=cols[bio])

par(mfrow=c(1, 3))
smoothScatter(mean, rowMeans(core == 0),xlim = c(0,4),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = 'True')
smoothScatter(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)), nbin = 256,
              nrpoints=Inf, pch="", cex=.7, xlim = c(0,4),
              xlab = "Estimated Mean Expression", main = 'Estimated',
              ylab = "Estimated Dropout Rate",ylim = c(0,1),
              colramp = pal)
plot(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)),
     xlab = "Estimated Mean Expression", xlim = c(0,4),
     ylab = "Estimated Dropout Rate", pch=19,
     ylim = c(0, 1), main = 'Estimated')